# Setup.R file
source("https://bit.ly/2i8sicn")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Formats xlsx and csv - what’s the difference?
csv is an extension for Comma Separated Value files. They are text files - directly readable.
Example: Gas prices in midwest since 1994
# Messy example: weekly Midwest gas prices since 1994. The header spans
# two rows and the week columns repeat (End Date, Value) pairs, so
# read.csv produces awkward names (X, X.1, ...) and factor columns.
midwest <- read.csv("https://bit.ly/2hwWiQ8")
head(midwest)
## Year.Month Week.1 X Week.2 X.1 Week.3 X.2 Week.4 X.3
## 1 End Date Value End Date Value End Date Value End Date Value
## 2 1994-Nov 28-Nov 1.122
## 3 1994-Dec 5-Dec 1.086 12-Dec 1.057 19-Dec 1.039 26-Dec 1.027
## 4 1995-Jan 2-Jan 1.025 9-Jan 1.046 16-Jan 1.031 23-Jan 1.054
## 5 1995-Feb 6-Feb 1.045 13-Feb 1.04 20-Feb 1.031 27-Feb 1.052
## 6 1995-Mar 6-Mar 1.053 13-Mar 1.042 20-Mar 1.048 27-Mar 1.065
## Week.5 X.4
## 1 End Date Value
## 2
## 3
## 4 30-Jan 1.055
## 5
## 6
str(midwest)
## 'data.frame': 212 obs. of 11 variables:
## $ Year.Month: Factor w/ 212 levels "","1994-Dec",..: 1 3 2 8 7 11 4 12 10 9 ...
## $ Week.1 : Factor w/ 86 levels "","1-Apr","1-Aug",..: 86 1 52 18 65 69 26 10 56 31 ...
## $ X : Factor w/ 197 levels "","0.905","0.918",..: 197 1 19 7 12 13 21 29 42 31 ...
## $ Week.2 : Factor w/ 86 levels "","10-Apr","10-Aug",..: 86 1 28 78 41 45 2 70 32 7 ...
## $ X.1 : Factor w/ 206 levels "","0.919","0.921",..: 206 1 17 14 12 13 27 39 45 34 ...
## $ Week.3 : Factor w/ 86 levels "","15-Apr","15-Aug",..: 86 1 52 18 65 69 26 10 56 31 ...
## $ X.2 : Factor w/ 199 levels "","0.91","0.929",..: 199 1 11 9 9 15 28 40 38 29 ...
## $ Week.4 : Factor w/ 85 levels "22-Apr","22-Aug",..: 85 82 51 17 64 68 25 9 55 30 ...
## $ X.3 : Factor w/ 201 levels "0.883","0.921",..: 201 29 9 14 13 15 32 44 34 27 ...
## $ Week.5 : Factor w/ 31 levels "","29-Apr","29-Aug",..: 31 1 1 16 1 1 1 9 1 27 ...
## $ X.4 : Factor w/ 74 levels "","0.955","1.023",..: 74 1 1 5 1 1 1 18 1 11 ...
There is clearly some work to be done…
read.csv vs. read.table
read.csv is really just a wrapper for read.table with certain parameters set:
read.csv
## function (file, header = TRUE, sep = ",", quote = "\"", dec = ".",
## fill = TRUE, comment.char = "", ...)
## read.table(file = file, header = header, sep = sep, quote = quote,
## dec = dec, fill = fill, comment.char = comment.char, ...)
## <bytecode: 0x50e0478>
## <environment: namespace:utils>
So to properly read in this data, it’s probably best to use read.table directly
?read.table
Have a look at the parameters of read.table (?read.table) to solve the following problems:
midwest_namesmidwest_datamidwest_namesmidwest_names <- read.table("https://bit.ly/2hwWiQ8",
nrows = 2, sep = ",",
stringsAsFactors = FALSE)midwest_datamidwest_data <- read.table("https://bit.ly/2hwWiQ8",
skip = 2, sep = ",",
stringsAsFactors = FALSE)values <- c(midwest_data$V3, midwest_data$V5, midwest_data$V7,
midwest_data$V9, midwest_data$V11)
# Build one date string per observation by pairing the Year-Month column
# (V1) with each week's end-day column (V2, V4, V6, V8, V10), in the same
# order the values vector was assembled above
dates <- c(paste(midwest_data$V1, midwest_data$V2, sep = "-"),
paste(midwest_data$V1, midwest_data$V4, sep = "-"),
paste(midwest_data$V1, midwest_data$V6, sep = "-"),
paste(midwest_data$V1, midwest_data$V8, sep = "-"),
paste(midwest_data$V1, midwest_data$V10, sep = "-"))
# Drop entries with no recorded price (not every month has 4 or 5 weeks)
dates <- dates[!is.na(values)]
values <- values[!is.na(values)]
library(lubridate)
dates <- ymd(dates)   # parse "YYYY-Mon-DD" strings into Date objects
midwest_gas <- data.frame(date = dates, price = values)
midwest_gas <- midwest_gas[with(midwest_gas, order(date)), ]
** A better way to do this will be shown later **
library(ggplot2)
qplot(date, price, data = midwest_gas, geom = "line")library(readxl)
# Excel version of the same gas-price data: download the .xls file
# locally, then read it with readxl
download.file(url = "http://bit.ly/2ihV8Ye",
destfile = "midwest.xls")
midwest2 <- read_excel("midwest.xls")
# Fix name issues: make.names() turns the raw spreadsheet headers into
# syntactically valid R column names
names(midwest2) <- make.names(names(midwest2))
head(midwest2, 4)## Source: local data frame [4 x 11]
##
## Year.Month Week.1 NA. Week.2 NA. Week.3
## (chr) (chr) (chr) (chr) (chr) (chr)
## 1 NA End Date Value End Date Value End Date
## 2 1994-Nov NA NA NA NA NA
## 3 1994-Dec 39786.000000 1.086000 39793.000000 1.057000 39800.000000
## 4 1995-Jan 39448.000000 1.025000 39455.000000 1.046000 39462.000000
## Variables not shown: NA. (chr), Week.4 (chr), NA. (chr), Week.5 (chr), NA.
## (chr)
foreign PackageOther file formats can be read using the functions from package foreign
read.spssread.xportread.mtpThe NHANES (National Health and Nutrition Survey) publishes data in the SAS xport format:
http://wwwn.cdc.gov/Nchs/Nhanes/Search/nhanes13_14.aspx
read.xport() to load the file into Rdownload.file("https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.XPT", "DEMO.XPT")
library(foreign)
demographic.vars <- read.xport("DEMO.XPT")MINITAB publishes several sample data sets. Download the Paint hardness data
http://support.minitab.com/en-us/datasets/PaintHardness.MTW
and read it in to R.
# download.file("http://support.minitab.com/en-us/datasets/PaintHardness.MTW", "paint.MTW")
# library(foreign)
# demographic.vars <- read.mtp("paint.MTW")plyr package contains the data set baseballdata(baseball, package = "plyr")
help(baseball, package = "plyr")
head(baseball)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 4 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1
## 44 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0
## 68 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0
## 99 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0
## 102 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0
## 106 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1
## ibb hbp sh sf gidp
## 4 NA NA NA NA NA
## 44 NA NA NA NA NA
## 68 NA NA NA NA NA
## 99 NA NA NA NA NA
## 102 NA NA NA NA NA
## 106 NA NA NA NA NAss <- subset(baseball, id == "sosasa01")
head(ss, 3)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb
## 66822 sosasa01 1989 1 TEX AL 25 84 8 20 3 0 1 3 0 2 0
## 66823 sosasa01 1989 2 CHA AL 33 99 19 27 5 0 3 10 7 3 11
## 67907 sosasa01 1990 1 CHA AL 153 532 72 124 26 10 15 70 32 16 33
## so ibb hbp sh sf gidp
## 66822 20 0 0 4 0 3
## 66823 27 2 2 1 2 3
## 67907 150 4 6 2 6 10
mean(ss$h / ss$ab)
## [1] 0.2681506
There should be an automatic way to calculate this.
for loopsi)result <- rep(NA, length(indexset))
for (i in indexset) {
... some statements ...
result[i] <- ...
}for loops for Baseballplayers <- unique(baseball$id)
# Career batting average for every player, via an explicit loop
n <- length(players)
ba <- rep(NA_real_, n)     # preallocate: one result per player
for (i in seq_len(n)) {    # seq_len() is safe even if n == 0 (1:n is not)
  career <- subset(baseball, id == players[i])
  ba[i] <- with(career, mean(h / ab, na.rm = TRUE))
}
summary(ba)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.1831 0.2459 0.2231 0.2699 0.5000 6for loops for Baseballi = 0:players <- unique(baseball$id)
n <- length(players)
ba <- rep(NA, n)
head(ba)
## [1] NA NA NA NA NA NAfor loops for Baseballi = 1:players <- unique(baseball$id)
# Watch the loop fill in results: after one iteration (i = 1 only),
# just the first element of ba has been replaced
ba <- rep(NA, length(players))
for (i in 1:1) {
career <- subset(baseball, id == players[i])
ba[i] <- with(career, mean(h / ab, na.rm = TRUE))
}
head(ba)
## [1] 0.3371163 NA NA NA NA NAfor loops for Baseballi = 2:players <- unique(baseball$id)
# After two iterations (i = 1, 2) the first two elements are filled
ba <- rep(NA, length(players))
for (i in 1:2) {
career <- subset(baseball, id == players[i])
ba[i] <- with(career, mean(h / ab, na.rm = TRUE))
}
head(ba)
## [1] 0.3371163 0.2489226 NA NA NA NAfor loops for Baseballi = 3:players <- unique(baseball$id)
# After three iterations (i = 1, 2, 3) the first three elements are filled
ba <- rep(NA, length(players))
for (i in 1:3) {
career <- subset(baseball, id == players[i])
ba[i] <- with(career, mean(h / ab, na.rm = TRUE))
}
head(ba)
## [1] 0.3371163 0.2489226 0.2018073 NA NA NAgames and atbatslibrary(dplyr)
# Per-player career statistics (batting average, games, at-bats)
# computed with an explicit loop; the dplyr summarise version below
# does the same thing more concisely
data(baseball, package = "plyr")
players <- unique(baseball$id)
# Preallocate one slot per player — never grow vectors inside a loop
ba <- rep(NA_real_, length(players))
games <- rep(NA_real_, length(players))
atbats <- rep(NA_real_, length(players))
for (i in seq_along(players)) {   # seq_along(), not 1:length(players)
  career <- subset(baseball, id == players[i])
  ba[i] <- with(career, mean(h / ab, na.rm = TRUE))   # career batting average
  games[i] <- with(career, sum(g, na.rm = TRUE))      # total games
  atbats[i] <- with(career, sum(ab, na.rm = TRUE))    # total at-bats
}
A special function: summarise or summarize
# Whole-dataset summaries: one summarise() call collapses the full
# baseball data frame to a single row of aggregate statistics
library(dplyr)
baseball <- read.csv("https://bit.ly/2iFyIwL")
baseball %>%
  summarise(
    ba = mean(h / ab, na.rm = TRUE),  # overall batting average
    games = sum(g, na.rm = TRUE),
    hr = sum(hr, na.rm = TRUE),
    ab = sum(ab, na.rm = TRUE)
  )
## ba games hr ab
## 1 0.2339838 1580070 113577 4891061summarise(subset(baseball, id == "sosasa01"),
ba = mean(h / ab, na.rm = TRUE),
games = sum(g, na.rm = TRUE),
hr = sum(hr, na.rm = TRUE),
ab = sum(ab, na.rm = TRUE))
## ba games hr ab
## 1 0.2681506 2354 609 8813dplyr + Summarisecareers <- summarise(group_by(baseball, id),
ba = mean(h / ab, na.rm = TRUE),
games = sum(g, na.rm = TRUE),
homeruns = sum(hr, na.rm = TRUE),
atbats = sum(ab, na.rm = TRUE))
head(careers)
## Source: local data frame [6 x 5]
##
## id ba games homeruns atbats
## (fctr) (dbl) (int) (int) (int)
## 1 aaronha01 0.3010752 3298 755 12364
## 2 abernte02 0.1824394 681 0 181
## 3 adairje01 0.2363071 1165 57 4019
## 4 adamsba01 0.2096513 482 3 1019
## 5 adamsbo03 0.2378073 1281 37 4019
## 6 adcocjo01 0.2751690 1959 336 6606Rather than nesting functions(inside(each(other))), use a pipe %>%:
careers <- baseball %>%
group_by(id) %>%
summarize(
ba = mean(h / ab, na.rm = TRUE),
games = sum(g, na.rm = TRUE),
homeruns = sum(hr, na.rm = TRUE),
atbats = sum(ab, na.rm = TRUE)
)
Pipes make code more readable and easier to understand.
a %>% function(b)
is the same as
function(a, b):
the piped variable is substituted for the first function argument.
If the argument passed in is not the first argument, use .:
b %>% function(a, .)
is the same as
function(a, b)
team)
unique) players has the team had?team)
unique players has the team had?baseball %>%
group_by(team) %>%
summarise(
nplayer = length(unique(id)),
first = min(year),
last = max(year))
## Source: local data frame [132 x 4]
##
## team nplayer first last
## (fctr) (int) (int) (int)
## 1 ALT 1 1884 1884
## 2 ANA 29 1997 2004
## 3 ARI 43 1998 2007
## 4 ATL 133 1966 2007
## 5 BAL 158 1954 2007
## 6 BFN 11 1879 1885
## 7 BFP 1 1890 1890
## 8 BL1 5 1872 1874
## 9 BL2 6 1882 1889
## 10 BL3 3 1890 1891
## .. ... ... ... ...players.per.year <- baseball %>%
group_by(team, year) %>%
summarize(nplayer = n()) 80% of data analysis is spent on the process of cleaning and preparing the data.
“Tidy datasets are all alike but every messy dataset is messy in its own way.” – Hadley Wickham
During a ten week sensory experiment, 12 individuals were asked to assess taste of french fries on several scales:
How ____ do the fries taste?
Each week, each individual tasted 6 batches of french fries.
french_fries <- read.csv("https://bit.ly/2hOI890")
head(french_fries)
## time treatment subject rep potato buttery grassy rancid painty
## 1 1 1 3 1 2.9 0.0 0.0 0.0 5.5
## 2 1 1 3 2 14.0 0.0 0.0 1.1 0.0
## 3 1 1 10 1 11.0 6.4 0.0 0.0 0.0
## 4 1 1 10 2 9.9 5.9 2.9 2.2 0.0
## 5 1 1 15 1 1.2 0.1 0.0 1.1 5.1
## 6 1 1 15 2 8.8 3.0 3.6 1.5 2.3This format is not ideal for data analysis
library(ggplot2)
qplot("1_buttery", buttery, data = french_fries, fill = I("red"), geom = "boxplot") +
geom_boxplot(aes(x = "2_grassy", y = grassy), fill = I("orange")) +
geom_boxplot(aes(x = "3_painty", y = painty), fill = I("yellow")) +
geom_boxplot(aes(x = "4_potato", y = potato), fill = I("green")) +
geom_boxplot(aes(x = "5_rancid", y = rancid), fill = I("blue")) +
xlab("variable") + ylab("rating")
Data which follows these rules is “tidy” and easier to work with in R:
The french fry data is in wide format
and should be in long format for plotting
This is done using the gather function.
When gathering, specify the keys (identifiers) and the values (measures).
When gathering, specify the keys (identifiers) and the values (measures).
library(tidyr)
# Wide -> long: collapse the five rating columns (potato through painty)
# into a key column `variable` and a value column `rating`, one row per
# (sample, scale) combination
french_fries_long <- gather(french_fries,
key = variable,
value = rating,
potato:painty)
head(french_fries_long)
## time treatment subject rep variable rating
## 1 1 1 3 1 potato 2.9
## 2 1 1 3 2 potato 14.0
## 3 1 1 10 1 potato 11.0
## 4 1 1 10 2 potato 9.9
## 5 1 1 15 1 potato 1.2
## 6 1 1 15 2 potato 8.8Plotting is much easier with long format data:
qplot(variable, rating, data = french_fries_long,
fill = variable, geom = "boxplot")In certain applications, a wide dataset may be preferable (e.g. to display in a table).
head(french_fries_long)
## time treatment subject rep variable rating
## 1 1 1 3 1 potato 2.9
## 2 1 1 3 2 potato 14.0
## 3 1 1 10 1 potato 11.0
## 4 1 1 10 2 potato 9.9
## 5 1 1 15 1 potato 1.2
## 6 1 1 15 2 potato 8.8The spread function (from tidyr) is used to do this:
# Long -> wide: one column per sensory scale (the inverse of gather)
french_fries_wide <- spread(french_fries_long,
key = variable, value = rating)
head(french_fries_wide)
## time treatment subject rep buttery grassy painty potato rancid
## 1 1 1 3 1 0.0 0.0 5.5 2.9 0.0
## 2 1 1 3 2 0.0 0.0 0.0 14.0 1.1
## 3 1 1 10 1 6.4 0.0 0.0 11.0 0.0
## 4 1 1 10 2 5.9 2.9 0.0 9.9 2.2
## 5 1 1 15 1 0.1 0.0 5.1 1.2 1.1
## 6 1 1 15 2 3.0 3.6 2.3 8.8 1.5Read in the billboard top 100 music data
billboard <- read.csv("https://bit.ly/2hQGDao")tidyr to convert this data into a long format.ggplot2 to create this time series plot:tidyr to convert this data into a long format.long_billboard <- gather(billboard,
key = week, value = rank,
X1:X76)
# Convert weeks to numeric variables
long_billboard$week <- long_billboard$week %>%
gsub("X", "", .) %>%
as.numeric()
# Get rid of NAs:
long_billboard <- long_billboard %>% na.omit()ggplot2 to create this time series plot:qplot(x = week, y = rank,
colour = artist, group = track,
data = long_billboard, geom = "line")dplyrlibrary(dplyr)
# Mean rating per sensory scale: split-apply-combine with dplyr
french_fries_rating <- french_fries_long %>%
  # SPLIT:
  group_by(variable) %>%
  # APPLY + COMBINE:
  summarize(rating = mean(rating, na.rm = TRUE))  # spell out TRUE; T can be reassigned
french_fries_rating
## Source: local data frame [5 x 2]
##
## variable rating
## (chr) (dbl)
## 1 buttery 1.8236994
## 2 grassy 0.6641727
## 3 painty 2.5217579
## 4 potato 6.9525180
## 5 rancid 3.8522302dplyr verbsThere are five primary dplyr verbs, representing distinct data analysis tasks:
filter: Keep only a subset of a data framearrange: Reorder the rows of a data frameselect: Select particular columns of a data framemutate: Add new columns computed from existing colssummarise: Create collapsed summaries of a data framefrench_fries %>%
filter(subject == 3, time == 1)
## time treatment subject rep potato buttery grassy rancid painty
## 1 1 1 3 1 2.9 0.0 0.0 0.0 5.5
## 2 1 1 3 2 14.0 0.0 0.0 1.1 0.0
## 3 1 2 3 1 13.9 0.0 0.0 3.9 0.0
## 4 1 2 3 2 13.4 0.1 0.0 1.5 0.0
## 5 1 3 3 1 14.1 0.0 0.0 1.1 0.0
## 6 1 3 3 2 9.5 0.0 0.6 2.8 0.0french_fries %>%
arrange(desc(rancid)) %>%
head
## time treatment subject rep potato buttery grassy rancid painty
## 1 9 2 51 1 7.3 2.3 0 14.9 0.1
## 2 10 1 86 2 0.7 0.0 0 14.3 13.1
## 3 5 2 63 1 4.4 0.0 0 13.8 0.6
## 4 9 2 63 1 1.8 0.0 0 13.7 12.3
## 5 5 2 19 2 5.5 4.7 0 13.4 4.6
## 6 4 3 63 1 5.6 0.0 0 13.3 4.4french_fries %>%
select(time, treatment, subject, rep, potato) %>%
head
## time treatment subject rep potato
## 1 1 1 3 1 2.9
## 2 1 1 3 2 14.0
## 3 1 1 10 1 11.0
## 4 1 1 10 2 9.9
## 5 1 1 15 1 1.2
## 6 1 1 15 2 8.8french_fries %>%
mutate(rancid2 = rancid^2) %>%
select(time, treatment, subject, rancid, rancid2) %>%
head
## time treatment subject rancid rancid2
## 1 1 1 3 0.0 0.00
## 2 1 1 3 1.1 1.21
## 3 1 1 10 0.0 0.00
## 4 1 1 10 2.2 4.84
## 5 1 1 15 1.1 1.21
## 6 1 1 15 1.5 2.25french_fries %>%
group_by(time, treatment) %>%
summarise(mean_rancid = mean(rancid),
sd_rancid = sd(rancid))
## Source: local data frame [30 x 4]
## Groups: time [?]
##
## time treatment mean_rancid sd_rancid
## (int) (int) (dbl) (dbl)
## 1 1 1 2.758333 3.212870
## 2 1 2 1.716667 2.714801
## 3 1 3 2.600000 3.202037
## 4 2 1 3.900000 4.374730
## 5 2 2 2.141667 3.117540
## 6 2 3 2.495833 3.378767
## 7 3 1 4.650000 3.933358
## 8 3 2 2.895833 3.773532
## 9 3 3 3.600000 3.592867
## 10 4 1 2.079167 2.394737
## .. ... ... ... ...This dataset contains information on over 300,000 flights that departed from New York City in the year 2013.
flights <- read.csv("https://bit.ly/2hzhAfW")dplyr and the pipe operator (%>%), create a data frame consisting of the average arrival delay (arr_delay) based on the destination airport (dest).arr_delay) based on the destination airport (dest).flights %>%
group_by(dest) %>%
summarise(avg_delay = mean(na.omit(arr_delay))) %>%
arrange(desc(avg_delay))
## Source: local data frame [105 x 2]
##
## dest avg_delay
## (fctr) (dbl)
## 1 CAE 41.76415
## 2 TUL 33.65986
## 3 OKC 30.61905
## 4 JAC 28.09524
## 5 TYS 24.06920
## 6 MSN 20.19604
## 7 RIC 20.11125
## 8 CAK 19.69834
## 9 DSM 19.00574
## 10 GRR 18.18956
## ..     ...       ...
# For each carrier, find its most-used origin airport: count flights per
# (carrier, origin) pair, then keep the pair(s) with the maximum count
by.origin <- flights %>%
  group_by(carrier, origin) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  group_by(carrier) %>%
  filter(count == max(count)) %>%
  rename(mostused = origin) %>%
  mutate(kind = "origin")
# Same idea for destination airports
by.dest <- flights %>%
  group_by(carrier, dest) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  group_by(carrier) %>%
  filter(count == max(count)) %>%
  rename(mostused = dest) %>%
  mutate(kind = "dest")
# Stack the two per-carrier tables into one
mostused <- bind_rows(by.origin, by.dest)
## Warning in rbind_all(x, .id): Unequal factor levels: coercing to character
mostused %>% group_by(carrier) %>%
filter(count == max(count)) %>% ungroup() %>%
arrange(desc(count))
## Source: local data frame [19 x 4]
##
## carrier mostused count kind
## (fctr) (chr) (int) (chr)
## 1 UA EWR 46087 origin
## 2 EV EWR 43939 origin
## 3 B6 JFK 42076 origin
## 4 DL LGA 23067 origin
## 5 MQ LGA 16928 origin
## 6 AA LGA 15459 origin
## 7 9E JFK 14651 origin
## 8 US LGA 13136 origin
## 9 WN EWR 6188 origin
## 10 VX JFK 3596 origin
## 11 FL LGA 3260 origin
## 12 AS EWR 714 origin
## 13 AS SEA 714 dest
## 14 F9 LGA 685 origin
## 15 F9 DEN 685 dest
## 16 YV LGA 601 origin
## 17 HA JFK 342 origin
## 18 HA HNL 342 dest
## 19 OO LGA 26 originDates are deceptively hard to work with.
Example: 02/05/2012. Is it February 5th, or May 2nd?
Other things are difficult too:
The lubridate package helps tackle some of these issues.
library(lubridate)
now()              # current date-time
today()            # current date
now() + hours(4)   # period arithmetic: four hours from now
today() - days(2)  # two days ago
## [1] "2016-12-29 10:14:08 CST"
## [1] "2016-12-29"
## [1] "2016-12-29 14:14:08 CST"
## [1] "2016-12-27"ymd("2013-05-14")
mdy("05/14/2013")
dmy("14052013")
ymd_hms("2013:05:14 14:50:30", tz = "America/Chicago")
## [1] "2013-05-14 UTC"
## [1] "2013-05-14 UTC"
## [1] "2013-05-14 UTC"
## [1] "2013-05-14 14:50:30 CDT"flights data, create a new column Date using lubridate. Paste together the columns year, month, and day in order to do this. See the paste function.dplyr to calculate the average departure delay for each date.flights data, create a new column Date using lubridate.flights$date <- paste(flights$year, flights$month, flights$day,
sep = "-") %>%
ymd() # Convert from character to date formatdplyr to calculate the average departure delay for each date.delay.dat <- flights %>%
group_by(date) %>%
summarise(dep_delay = mean(dep_delay, na.rm = TRUE))qplot(date, dep_delay, geom = "line", data = delay.dat)